Generate sample data for the Items and Baskets



In [253]:

    
# Generate item ids

item_ids = range(1, 101) # 100 items, numbered from 1 to 100
print items









    



[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]



In [254]:

    
# Item i is in basket b if and only if i divides b with no remainder. 
# Thus, item 1 is in all the baskets, item 2 is in all fifty of the 
# even-numbered baskets, and so on

# Item Generator
def generate_items(basket_id, item_ids):
    """
    Generate items belonging to a given basket id
    
    :param: basket_id: the id of the basket
    :param: items: the list of items
    :return: set of items whose ids divide the basket id with no remainder
    """
    basket_items = {item for item in item_ids if item % basket_id == 0}
    return basket_items


# Baskets Generator
baskets = {b: generate_items(b, item_ids) for b in range(1, 101)} 


# View contents of Basket #:20
print "Basket ID 20: ", baskets[20]









    



Basket ID 20:  set([40, 60, 20, 80, 100])



In [269]:

    
# Generate the singleton item sets
# here, we use tuples instead of sets since we want to use them as keys for dicts

items = [tuple([x]) for x in item_ids]
print items









    



[(1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,), (10,), (11,), (12,), (13,), (14,), (15,), (16,), (17,), (18,), (19,), (20,), (21,), (22,), (23,), (24,), (25,), (26,), (27,), (28,), (29,), (30,), (31,), (32,), (33,), (34,), (35,), (36,), (37,), (38,), (39,), (40,), (41,), (42,), (43,), (44,), (45,), (46,), (47,), (48,), (49,), (50,), (51,), (52,), (53,), (54,), (55,), (56,), (57,), (58,), (59,), (60,), (61,), (62,), (63,), (64,), (65,), (66,), (67,), (68,), (69,), (70,), (71,), (72,), (73,), (74,), (75,), (76,), (77,), (78,), (79,), (80,), (81,), (82,), (83,), (84,), (85,), (86,), (87,), (88,), (89,), (90,), (91,), (92,), (93,), (94,), (95,), (96,), (97,), (98,), (99,), (100,)]

Support

Support of an item I is given by the number of baskets of which I is a subset. For example, if total number of baskets is 100 and item 2 is present in 50 of them, then the support for item 2 is 50.



In [270]:

    
def get_support_values(items, baskets):
    """
    Find and return support value of the items

    Support of an item I is given by the number of baskets of which I is a subset
    :param items: List of item subsets. Each list subset can contain one or more items
    :param baskets: A dict of baskets with basket id as key and basket contents as sets
    :return: support_values: A dict of Support value for each of the item subsets 
    """
    # get the baskets that each itemsets are part of
    item_baskets = {item: [basket_id for basket_id in baskets.keys() 
                                 if set(item) <= baskets[basket_id]] for item in items}  
    
    # get the number of baskets (support value) that each itemsets are part of
    support_values = { item: len(item_baskets[item]) for item in item_baskets.keys()}
    
    return support_values, item_baskets



In [271]:

    
support_values, item_baskets = get_support_values(items, baskets)
print support_values









    



{(32,): 6, (23,): 2, (99,): 6, (74,): 4, (49,): 3, (24,): 8, (15,): 4, (91,): 4, (66,): 8, (41,): 2, (16,): 5, (7,): 2, (83,): 2, (58,): 4, (33,): 4, (8,): 4, (100,): 9, (75,): 6, (50,): 6, (25,): 3, (92,): 6, (67,): 2, (42,): 8, (17,): 2, (84,): 12, (59,): 2, (34,): 4, (9,): 3, (76,): 6, (51,): 4, (26,): 4, (1,): 1, (93,): 4, (68,): 6, (43,): 2, (18,): 6, (85,): 4, (60,): 12, (35,): 4, (10,): 4, (77,): 4, (52,): 6, (27,): 4, (2,): 2, (94,): 4, (69,): 4, (44,): 6, (19,): 2, (86,): 4, (61,): 2, (36,): 9, (11,): 2, (78,): 8, (53,): 2, (28,): 6, (3,): 2, (95,): 4, (70,): 8, (45,): 6, (20,): 6, (96,): 12, (87,): 4, (62,): 4, (37,): 2, (12,): 6, (88,): 8, (79,): 2, (54,): 8, (29,): 2, (4,): 3, (80,): 10, (71,): 2, (46,): 4, (21,): 4, (97,): 2, (72,): 12, (63,): 6, (38,): 4, (13,): 2, (89,): 2, (64,): 7, (55,): 4, (30,): 8, (5,): 2, (81,): 5, (56,): 8, (47,): 2, (22,): 4, (98,): 6, (73,): 2, (48,): 10, (39,): 4, (14,): 4, (90,): 12, (65,): 4, (40,): 8, (31,): 2, (6,): 4, (82,): 4, (57,): 4}

Frequent Itemsets

Frequent Itemsets are those with a support value exceeding a given support threshold.



In [272]:

    
def find_frequent_items(support_values, support_threshold):
    """
    Return item subsets with support values greater than support_threshold

    :param support_values: List of support values corresponding to the items subsets
    :param support_threshold: items with support value greater than support_threshold are 'frequent'
    :return: list of item sets with support value greater than the threshold
    """
    
    support_threshold_exceeded = [item for item in support_values.keys() if support_values[item] >= support_threshold]
    return support_threshold_exceeded



In [278]:

    
frequent_singletons = find_frequent_items(support_values, 5)
print frequent_singletons









    



[(32,), (99,), (24,), (66,), (16,), (100,), (75,), (50,), (92,), (42,), (84,), (76,), (68,), (18,), (60,), (52,), (44,), (36,), (78,), (28,), (70,), (45,), (20,), (96,), (12,), (88,), (54,), (80,), (72,), (63,), (64,), (30,), (81,), (56,), (98,), (48,), (90,), (40,)]

Doubletons

Doubletons are sets with two items. Let us calculate the support values and identify frequent doubletons :



In [279]:

    
import itertools

# generate all possible two item combinations
two_itemsets = list(itertools.combinations(item_ids, 2))

# get the support values and the baskets for the doubleton item sets
doubletons_support_values, doubletons_baskets = get_support_values(two_itemsets, baskets)

# get the frequent doubleons with support threshold = 5
frequent_doubletons = find_frequent_items(doubletons_support_values, 5)
print frequent_doubletons









    



[(12, 72), (32, 96), (24, 84), (36, 54), (32, 48), (54, 90), (20, 60), (16, 32), (36, 60), (60, 96), (40, 100), (18, 90), (72, 96), (18, 54), (42, 84), (45, 90), (60, 84), (48, 72), (84, 96), (12, 84), (48, 96), (24, 72), (16, 96), (48, 60), (60, 72), (40, 60), (36, 48), (24, 60), (20, 40), (28, 56), (32, 64), (48, 64), (36, 84), (12, 36), (20, 100), (36, 90), (36, 72), (18, 36), (50, 100), (56, 84), (60, 100), (24, 96), (30, 90), (30, 60), (64, 80), (16, 80), (24, 48), (48, 84), (72, 90), (54, 72), (12, 48), (20, 80), (64, 96), (80, 96), (32, 80), (18, 72), (48, 80), (28, 84), (12, 96), (16, 48), (72, 84), (44, 88), (60, 80), (40, 80), (36, 96), (24, 36), (12, 24), (80, 100), (16, 64), (12, 60), (60, 90)]

Confidence

The Confidence of the association rule $ I \longrightarrow j $ is the ratio of the support of $ I \cup \{j\} $ to the support of I. That is, the confidence of the rule is the fraction of baskets with all of I that also contain j



In [319]:

    
# Warning: Naive Implementation
# Refer next notebooks for a faster implementation

def calc_confidence(itemset1, itemset2, item_ids, baskets):
    """
    Calculate the confidence of association rule itemset1 -> itemset2

    The 'confidence' is defined as the ratio of support for (itemset1 U itemset2) to
    the support for itemset1 only.
    :param itemset1: set containing itemset 1
    :param itemset2: set containing itemset 2
    :param item_ids: list of item ids
    :param baskets: list of all baskets
    :return: support (itemset1 U itemset2) / support(itemset1)
    """
    
    # find the length of the union of itemset1 & itemset2
    itemset_len = len(itemset1.union(itemset2))
    
    # generate all possible item combinations of length of the combined set
    all_itemsets = list(itertools.combinations(item_ids, itemset_len))

    
    # generate the support values & baskets for the length of combined set
    support_values, baskets_info = get_support_values(all_itemsets, baskets)
    
    # get the support value for itemset1 union itemset2
    union_itemset_support = support_values[tuple(sorted(list(itemset1.union(itemset2))))]
    
    # get support value for itemset1
    itemset1_baskets = [tuple(basket_id) for itemset, basket_id in baskets_info.iteritems() if itemset1 <= set(itemset)]
    itemset1_support = len(set(itemset1_baskets))
    
    return union_itemset_support / (itemset1_support * 1.0)



In [320]:

    
print calc_confidence({4,12}, {2}, item_ids, baskets)









    



0.666666666667



In [322]:

    
print calc_confidence({24,60}, {8}, item_ids, baskets)

0.5



In [323]:

    
# Warning : takes a long time
print calc_confidence({2,3,4}, {5}, item_ids, baskets)

1.0